# Remember which objects already exist so the cleanup at the end of the script
# can remove only what this script created.
keep <- ls()


##################################
#Get Indiana Regiment Enlistments#
##################################

# CWDB person table, restricted to rows whose stateserved is "IN"
in_veterans_cwdb <- fread(paste0(cwdb_path, '/person_table.csv'))[stateserved %in% "IN"]

# Roster rows for exactly those persons
in_veterans_roster <- fread(paste0(cwdb_path, '/regiment_roster_table.csv'))[
  personpk %in% in_veterans_cwdb$personpk
]

# Regiment table (read in full)
in_regiments <- fread(paste0(cwdb_path, '/regiments.csv'))

# Person <-> roster: keyed join on personpk. Every row of in_veterans_cwdb is
# kept and picks up the columns of its matching roster row(s).
setkeyv(in_veterans_cwdb, "personpk")
setkeyv(in_veterans_roster, "personpk")
in_veterans_cwdb_roster <- in_veterans_roster[in_veterans_cwdb]

# Roster <-> regiment: keyed join on regimentpk, keeping every row of the
# person/roster table and adding the regiment-level columns.
setkeyv(in_veterans_cwdb_roster, "regimentpk")
setkeyv(in_regiments, "regimentpk")

roster_match_cwdb <- in_regiments[in_veterans_cwdb_roster]

# Reduce the merged roster to the columns used further down
roster_match_cwdb <- roster_match_cwdb[, .(
  personpk,
  firstname,
  lastname,
  midname,
  residence,
  state,
  regimentpk,
  fullname,
  regimentnumber,
  unitcompany,
  mustindate,
  enlistdate,
  percent_killed,
  percent_disabled,
  percent_diedofdisease,
  percent_diedpow
)]

# Rows whose regiment fullname contains "Indiana", with the fields reshaped
# for record linkage:
#   company     - unitcompany with every non-word character stripped
#   muster_date - mustindate parsed as YYYY-MM-DD and stored as a numeric Date
#   enlist_year - year component of enlistdate (parsed as YYYY-MM-DD)
roster_match_cwdb_in <- roster_match_cwdb[
  str_detect(fullname, "Indiana"),
  .(
    personpk,
    firstname,
    midname,
    lastname,
    regiment_number = regimentnumber,
    regimentpk,
    residence,
    company = str_remove_all(unitcompany, "\\W"),
    muster_date = as.numeric(as.Date(strptime(mustindate, format = "%Y-%m-%d"))),
    enlist_year = year(strptime(enlistdate, format = "%Y-%m-%d"))
  )
]

# Company values that do not identify a company are blanked to NA
roster_match_cwdb_in[company %in% c("", "U", "S", "Band"), company := NA]

# Standardised name parts. clean_names() is defined outside this section; its
# result is stored, in order, as match_first, match_middle, match_last and
# match_first_clean.
roster_match_cwdb_in[
  ,
  paste("match", c("first", "middle", "last", "first_clean"), sep = "_") :=
    clean_names(first = firstname, middle = midname, last = lastname)
]

# First character of each cleaned name part
roster_match_cwdb_in[, `:=`(
  match_middle_init = str_sub(match_middle, 1, 1),
  match_first_init = str_sub(match_first, 1, 1),
  match_last_init = str_sub(match_last, 1, 1)
)]

# Phonetic (metaphone) codes of the cleaned first and last names
roster_match_cwdb_in[, `:=`(
  first_sound = metaphone(match_first),
  last_sound = metaphone(match_last)
)]


#####################################
#Matching to 1874 Directory Counties#
#####################################

# Crosswalk from CWDB residence strings to Indiana counties
cwdb_residence_xwalk <- fread("./raw/cwdb_residence_county_xwalk.csv")

# Grade each crosswalk row by how ambiguous the residence is:
#   1 - the residence maps to exactly one county
#   2 - several counties, and n_matches equals n_use_matches
#   3 - everything else
cwdb_residence_xwalk[
  ,
  match_quality := ifelse(
    n_matches == 1,
    1,
    ifelse(n_matches == n_use_matches, 2, 3)
  )
]

# Keep only rows flagged use_county, reduced to the needed columns and
# de-duplicated
cwdb_residence_xwalk <- unique(
  cwdb_residence_xwalk[
    (use_county),
    .(cwdb_residence, census_county, match_quality, n_matches, n_use_matches)
  ]
)

# Attach candidate counties to every Indiana veteran via the residence string.
# One veteran can match several crosswalk rows, hence allow.cartesian.
# use_quality starts out as NA (numeric) and is filled in later.
setkey(in_veterans_cwdb, residence)
setkey(cwdb_residence_xwalk, cwdb_residence)
to_match_cwdb <- cwdb_residence_xwalk[in_veterans_cwdb, allow.cartesian = TRUE][
  ,
  .(
    personpk,
    census_county,
    cwdb_residence,
    match_quality,
    use_quality = NA_real_,
    n_matches,
    n_use_matches,
    survivedwar
  )
]

# Tally, per regiment/company and candidate county, how many roster rows fall
# into each match-quality class. These tallies are used to resolve ambiguous
# (quality 2 or 3) and missing residences.
#   co_n         - number of distinct persons in the company
#   residence_na - number of joined rows in the company with an empty residence
#   mq1/mq2/mq3  - rows of that company/county with match_quality 1, 2, 3
# Rows without a county are dropped at the end.
setkey(roster_match_cwdb_in, residence)
setkey(cwdb_residence_xwalk, cwdb_residence)
county_by_company <- cwdb_residence_xwalk[roster_match_cwdb_in, allow.cartesian = TRUE][
  ,
  .(
    co_n = length(unique(personpk)),
    census_county,
    match_quality,
    residence_na = sum(cwdb_residence %in% "")
  ),
  by = .(regimentpk, company)
][
  ,
  .(
    mq1 = sum(match_quality %in% 1),
    mq2 = sum(match_quality %in% 2),
    mq3 = sum(match_quality %in% 3)
  ),
  by = .(regimentpk, company, census_county, co_n, residence_na)
][
  !is.na(census_county)
]

# mq2: among a company's counties that have quality-2 rows, use the one(s)
# with the largest (non-zero) count of exact matches
county_by_company[, mq2_use := mq2 > 0]
county_by_company[
  mq2 > 0,
  mq2_use := (mq1 == max(mq1)) & mq1 > 0,
  by = .(regimentpk, company)
]

# mq3: use a county with quality-3 rows only when its exact matches make up
# more than half of the company (excluding that county's quality-2 rows from
# the denominator)
county_by_company[
  ,
  mq3_use := mq3 > 0 & mq1 > 0 & (mq1 / (co_n - mq2)) > 0.5
]

# residence_na: same majority rule, applied to companies that contain rows
# with an empty residence
county_by_company[
  ,
  rna_use := residence_na > 0 & mq1 > 0 & (mq1 / (co_n - mq2)) > 0.5
]


# Assign use_quality (1 = best ... 6 = worst) to every veteran/county candidate
# row in to_match_cwdb. Rows still NA at the end are dropped.
#
# Each loop below walks the flagged rows of county_by_company by row index.
# Iterating over which(flag) selects the same rows, in the same order, as
# subsetting by the flag, without re-filtering the table on every iteration.

#mq1: exact matches
to_match_cwdb[(match_quality %in% 1), use_quality := 1]

#Assign county to county with plurality of exact matches w/in company:
for (i in which(county_by_company$mq2_use)) {
  temp <- county_by_company[i]
  # everyone on the roster in this regiment/company
  i_persons <- roster_match_cwdb_in[regimentpk %in% temp$regimentpk & company %in% temp$company, personpk]
  to_match_cwdb[personpk %in% i_persons &
                  match_quality %in% 2 &
                  (census_county %in% temp$census_county),
                use_quality := 2
                ]
}

# NOTE(review): this assigns 2 to every remaining match_quality-2 row, so the
# county selection made by the loop above does not change the final values.
# Confirm that this is intended.
to_match_cwdb[is.na(use_quality) & match_quality %in% 2, use_quality := 2]

#Assign county to county with majority of exact matches w/in company:
for (i in which(county_by_company$mq3_use)) {
  temp <- county_by_company[i]
  i_persons <- roster_match_cwdb_in[regimentpk %in% temp$regimentpk & company %in% temp$company, personpk]
  to_match_cwdb[personpk %in% i_persons &
                  match_quality %in% 3 &
                  (census_county %in% temp$census_county),
                use_quality := 3
                ]
}

#Fix residence NA: resolve missing residence to county with majority of exact matches
for (i in which(county_by_company$rna_use)) {
  temp <- county_by_company[i]
  i_persons <- roster_match_cwdb_in[regimentpk %in% temp$regimentpk & company %in% temp$company, personpk]
  # rows with an empty residence get the company's majority county, quality 4
  to_match_cwdb[personpk %in% i_persons &
                  cwdb_residence %in% "",
                c("census_county", 'use_quality') := list(temp$census_county, 4)
                ]
}

#Match quality 5: multiple county matches (<=4), unresolved
# no_q_flag is TRUE when none of a person's candidate rows has a quality yet
to_match_cwdb[, no_q_flag := all(is.na(use_quality)), by = list(cwdb_residence, personpk)]
to_match_cwdb[(no_q_flag) & match_quality %in% 3 & n_matches <= 4, use_quality := 5]

#Quality 6: multiple county matches (>4), unresolved
# the flag is recomputed because the quality-5 step may have filled some rows
to_match_cwdb[, no_q_flag := all(is.na(use_quality)), by = list(cwdb_residence, personpk)]
to_match_cwdb[(no_q_flag) & is.na(use_quality) & !is.na(census_county), use_quality := 6]

#Soldiers linked to 1874 Directory County
to_match_cwdb <- to_match_cwdb[!is.na(use_quality), list(personpk, cwdb_residence, census_county, use_quality, survivedwar)]


#############################
#Clean Ancestry Indiana Data#
#############################

# Read the 'persons' table from the Ancestry SQLite database into a data.table
con <- dbConnect(RSQLite::SQLite(), paste0(ancestry_path, "/incw.db"))
db <- as.data.table(dbReadTable(con, 'persons'))
dbDisconnect(con)

# First four-digit run in the birth / enrollment fields, as numbers
db[, `:=`(
  birth_y = as.numeric(str_extract(birth_year, "\\d{4}")),
  enlist_y = as.numeric(str_extract(enrollment_date, "\\d{4}"))
)]
db[, age_at_enlist := enlist_y - birth_y]

#change regiment_number: infantry designation to cavalry
# Lookup from the infantry-style regiment number to the cavalry number
ancestry_change <- data.table(
  regiment = as.character(c(28, 39, 41, 45, 77, 90, 71, 119, 121, 125, 126, 127, 131)),
  cav_regiment = as.character(c(1, 8, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13))
)

# Keyed join keeps every db row; rows with a lookup hit get the cavalry number
setkey(db, regiment)
setkey(ancestry_change, regiment)
db <- ancestry_change[db]
db[!is.na(cav_regiment), regiment := cav_regiment]

#Extract names
# Glue split surname prefixes onto the rest of the name, in this order:
#   1. " St" / " Ste": drop an optional period and the following space
#   2. " Mc" / " Mac" / " MC" / " MaC": drop the space before the next letter
#   3. " O": drop the apostrophe or space before the next letter
db[, name := name %>%
     str_replace("(?<=\\sSt[e]?)[.]? ", "") %>%
     str_replace("(?<=\\sM[a]?[Cc]) (?=[A-Za-z])", "") %>%
     str_replace("(?<=\\sO)[' ](?=[A-Za-z])", "")]

# Last name = trailing run of letters; first/middle = everything before the
# final whitespace-separated alphabetic token
db[, `:=`(
  first_middle_name = str_extract(name, ".+(?=\\s[A-Za-z]+$)"),
  last_name = str_extract(name, "[A-Za-z]+?$")
)]

# Standardised name parts. clean_names() is defined outside this section; no
# separate middle name is supplied here.
db[
  ,
  paste("match", c("first", "middle", "last", "first_clean"), sep = "_") :=
    clean_names(first = first_middle_name, middle = NULL, last = last_name)
]

# Linkage table. Column names mirror those of the CWDB roster where the two
# are compared. muster_date is enrollment_date parsed as "%d %b %Y" and stored
# as a numeric Date.
ancestry_match <- db[, .(
  match_first,
  match_first_clean,
  match_middle,
  match_last,
  birth_year = birth_y,
  muster_date = as.numeric(as.Date(strptime(enrollment_date, format = "%d %b %Y"))),
  enlist_year = enlist_y,
  regiment_number = regiment,
  company = str_to_upper(company),
  ancestry_id = id
)]

# Middle initial and metaphone codes for first/last name
ancestry_match[, `:=`(
  middle_initial = str_sub(match_middle, 1, 1),
  first_sound = metaphone(match_first),
  last_sound = metaphone(match_last)
)]


############################################
#Merge CWDB to Indiana Roster from Ancestry#
############################################

#Block by regiment
# Only records with the same regiment_number are compared with each other
block_out <- blockData(roster_match_cwdb_in, ancestry_match, varnames = "regiment_number")

#Save matches to list
# One slot per block. A slot stays NULL if fastLink fails for that block.
matches_list <- vector(mode = 'list', length = length(block_out))

#Iterate over regiments
#match on: first name, last name, middle name, company, muster in date, last name sound, first name sound
for (i in seq_along(block_out)) {
  print(i)
  x <- block_out[[i]]
  out <- try(
    fastLink(dfA = roster_match_cwdb_in[x$dfA.inds,],
             dfB = ancestry_match[x$dfB.inds,],
             varnames = c("match_first_clean", "match_last", 'match_middle', 'company', 'muster_date', "last_sound", "first_sound"),
             stringdist.match = c("match_first_clean", "match_last"),
             numeric.match = c('muster_date'),
             partial.match = c("match_first_clean", "match_last", 'muster_date'),
             cut.a.num = 30,
             cut.p.num = 60,
             cut.a = 0.94,
             cut.p = 0.85,
             n.cores = 11,
             cond.indep = TRUE,
             return.all = TRUE
    )
  )
  if (inherits(out, "try-error")) {
    print("DID NOT RUN")
    # Leave the preallocated NULL in place. Assigning NULL through `[[<-`
    # would delete the element, shortening the list so that it no longer
    # lines up with block_out.
    next
  }
  matches_list[[i]] <- out
}

#Pull out matches
# Blocks whose fastLink run failed have no fit to extract from. They yield
# NULL here and are dropped before the results are stacked.
m_out <- mapply(
  function(x, m) {
    if (is.null(m)) {
      return(NULL)
    }
    getMatches(roster_match_cwdb_in[x$dfA.inds,],
               ancestry_match[x$dfB.inds,],
               m, threshold.match = 0.01) %>% as.data.table
  },
  x = block_out, m = matches_list, SIMPLIFY = FALSE
)

#Combine matched results
matched_cwdb_ancestry <- rbindlist(Filter(Negate(is.null), m_out))

###################################
#Save CWDB Indiana Soldiers to Use#
###################################

# Indiana roster rows for soldiers that received a directory county, reduced
# to the identifiers and cleaned name fields
cwdb_census_match <- roster_match_cwdb_in[
  personpk %in% to_match_cwdb$personpk,
  .(
    personpk, regimentpk, regiment_number, company, enlist_year,
    match_first, match_middle, match_last, match_first_clean,
    match_middle_init, first_sound, last_sound
  )
]

# Birth years from the Ancestry links. A link is kept when
#   first name: gamma.1 is 1 or 2, or gamma.7 is 2, and
#   last name:  gamma.2 is 1 or 2, or gamma.6 is 2
# gamma.k refers to the k-th linkage variable in the fastLink call (1 = cleaned
# first name, 2 = last name, 6 = last-name sound, 7 = first-name sound).
cwdb_birth_years <- unique(
  matched_cwdb_ancestry[
    ((gamma.1 %in% 1:2) | (gamma.7 %in% 2)) &
      ((gamma.2 %in% 1:2) | (gamma.6 %in% 2)),
    .(personpk, birth_year)
  ]
)

# Add birth years: keyed join on personpk keeping every roster row
setkey(cwdb_census_match, personpk)
setkey(cwdb_birth_years, personpk)

cwdb_census_match <- cwdb_birth_years[cwdb_census_match]

# Add counties: keyed join on personpk keeping every row from the step above
setkey(to_match_cwdb, personpk)
setkey(cwdb_census_match, personpk)

cwdb_census_match <- to_match_cwdb[cwdb_census_match]

# Write the cleaned CWDB matching data
fwrite(cwdb_census_match, "./cleaned/cwdb_to_match.csv")

# Remove everything this script created (anything not present in the initial
# snapshot), then release memory
rm(list = setdiff(ls(), c(keep, 'keep')))
gc()